library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.1.4     ✓ forcats 0.5.1
## ✓ readr   2.1.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(knitr)
library(formatR)
library(skimr)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(lm.beta) 
library(lsr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(plot3D)
library(dotwhisker)
## Registered S3 method overwritten by 'parameters':
##   method                         from      
##   format.parameters_distribution datawizard
knitr::opts_chunk$set(
echo = TRUE, # render output of code chunks,
warning = TRUE, # do not suppress "warnings"
message = TRUE, # do not suppress "messages"
title="#",
comment = "##", # prefix for comment lines
tidy = TRUE,
tidy.opts = list(blank = FALSE, width.cutoff = 75),
fig.path = "Kim et_al_photos/", # name of folder for images
fig.align = "center" # centers any images on the page
)
# Begin working on descriptive Stats
f <- "https://raw.githubusercontent.com/mrpickett26/Individual-Data-Analysis-Replication/main/tumor%20location%20raw%20data_final.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 1102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Size_Pathol_cm_
## dbl (13): Subtype, Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(d)
## # A tibble: 6 × 15
##      ID Size_Pathol_cm_ Subtype Qudrants2_1_UOQ_2_UIQ_3_… MR_Location_ML__1__me…
##   <dbl> <chr>             <dbl>                     <dbl>                  <dbl>
## 1     1 1.50                  1                         2                      2
## 2     2 3.00                  1                         2                      2
## 3     3 1.60                  1                         1                      2
## 4     4 1.80                  1                         5                      2
## 5     5 3.00                  1                         1                      2
## 6     6 3.50                  3                         1                      2
## # … with 10 more variables:
## #   MR_Location_AP__1__anterior_2__middle_3__posterior_ <dbl>,
## #   Y_distance <dbl>, X_normalized <dbl>, Y_normalized <dbl>,
## #   Z_normalized <dbl>, MG_grade <dbl>, X_distance <dbl>, Z_distance <dbl>,
## #   Palpability <dbl>, Histologic_full <dbl>
skim(d)  #getting some quick stats before filtering... based on raw data a lot of filtering needs to occur but useful for a couple variables
Data summary
Name d
Number of rows 1102
Number of columns 15
_______________________
Column type frequency:
character 1
numeric 14
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Size_Pathol_cm_ 0 1 4 6 0 67 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ID 0 1 551.50 318.26 1.0 276.25 551.5 826.75 1102.0 ▇▇▇▇▇
Subtype 0 1 1.46 0.84 1.0 1.00 1.0 1.00 3.0 ▇▁▁▁▂
Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping 0 1 2.05 1.25 1.0 1.00 2.0 3.00 5.0 ▇▅▂▁▂
MR_Location_ML__1__medial_2__central_3__lateral_ 0 1 2.20 0.77 1.0 2.00 2.0 3.00 3.0 ▅▁▇▁▇
MR_Location_AP__1__anterior_2__middle_3__posterior_ 0 1 2.37 0.64 1.0 2.00 2.0 3.00 3.0 ▂▁▇▁▇
Y_distance 0 1 2.19 1.88 0.0 0.70 1.8 3.18 10.1 ▇▃▂▁▁
X_normalized 0 1 0.05 0.18 -0.4 -0.10 0.1 0.20 0.5 ▁▅▇▇▁
Y_normalized 0 1 0.24 0.18 0.0 0.10 0.2 0.40 0.9 ▇▇▅▁▁
Z_normalized 0 1 0.06 0.13 -0.4 0.00 0.1 0.20 0.5 ▁▃▇▃▁
MG_grade 0 1 2.78 0.84 1.0 2.00 3.0 3.00 4.0 ▂▃▁▇▃
X_distance 0 1 0.45 1.51 -4.2 -0.70 0.5 1.60 5.5 ▁▅▇▅▁
Z_distance 0 1 0.97 2.05 -5.5 -0.38 1.0 2.50 7.1 ▁▃▇▅▁
Palpability 0 1 0.60 0.49 0.0 0.00 1.0 1.00 1.0 ▅▁▁▁▇
Histologic_full 0 1 2.30 0.67 1.0 2.00 2.0 3.00 3.0 ▂▁▇▁▇
knitr::opts_chunk$set(fig.path = "Kim_et_al_photos/")
# DESCRIPTIVE STATISTICS
## Want to separate data out into the two different subtypes (ER and
## Triple Negative), key factor in the paper. In this section I focus on
## the Triple Negative subtype. From this filtered dataset, I create
## variables for the number (counts) of each histological grade as well as
## the percentage of each histological grade within the population. I
## wrote a function that included n in it because later on, I will have to
## filter out some of the NAs that are in the raw dataset and this made it
## easier.
Triple_neg <- filter(d, Subtype == "3")
n <- nrow(Triple_neg)
T_Hist_grade_1 <- (length(which(Triple_neg$Histologic_full == "1")))
T_hist_grade_2 <- (length(which(Triple_neg$Histologic_full == "2")))
T_hist_grade_3 <- (length(which(Triple_neg$Histologic_full == "3")))
percent <- function(x) {
    perc <- x/256
    return(perc)
}
T_hist_grade1_percent <- percent(T_Hist_grade_1)
T_hist_grade2_percent <- percent(T_hist_grade_2)
T_hist_grade3_percent <- percent(T_hist_grade_3)
## T_Hist_grade tells the number of patients whose identified cancers
## belong to each type.
## Here I replace the #NULL! with NA so that I can filter it out into a
## new dataframe. I then convert it into a numeric so that it can be
## passed through the built in mean function.
ans <- Triple_neg %>%
    replace(. == "#NULL!", NA)
g <- ans[!is.na(ans$Size_Pathol_cm_), ]
Mean_Size_T <- as.numeric(g$Size_Pathol_cm_)
Mean_size_Trip_neg <- mean(Mean_Size_T)
Mean_size_Trip_neg
## [1] 2.651528
## After finding the mean of the size of the triple negative tumor in cm
## after omitting all #NULL! values, I think the data set in the paper
## maybe replaced the Null values with 0 or used the n of 256 to calculate
## the mean because my mean is slightly higher
Range_size_trip_neg <- range(Mean_Size_T)
Range_size_trip_neg
## [1] 0.2 9.3
# find the range of values for the tumor size. The paper says that the
# range of the values for tumor size are (0.2-5.4), whereas my
# calculations indicate that the range extends up to 9.3.
# Palpability
Triple_neg$Palpability[Triple_neg$Palpability == "1"] <- "No"
Triple_neg$Palpability[Triple_neg$Palpability == "0"] <- "Yes"
## Changed the 1 and 0s found in Palpability to Yes and No values, and now
## count the number of yes and no values
No <- (length(which(Triple_neg$Palpability == "No")))
Yes <- (length(which(Triple_neg$Palpability == "Yes")))
Percent_palp <- percent(Yes)
Percent_not_palp <- percent(No)
## Calculated the number and percentage of palpable vs not palpable tumors
## in the Triple Negative subtype, using the function as defined above. I
## thought it was easiest to assign them into new variable in order to
## format them into the table properly.
## Now I will assign the numeric positional data to new values which will
## make it easier to filter out the data and do further analyses, again I
## thought variable assignment was the best for later calculations.
UOQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "1")))
percent_UOQ <- percent(UOQ)
UIQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "2")))
percent_UIQ <- percent(UIQ)
LOQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "3")))
percent_LOQ <- percent(LOQ)
LIQ <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "4")))
percent_LIQ <- percent(LIQ)
periareolar <- (length(which(Triple_neg$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "5")))
percent_periareolar <- percent(periareolar)
## Doing the assignments for mediolateral locations
medial <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "1")))
percent_medial <- percent(medial)
central <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "2")))
percent_central <- percent(central)
lateral <- (length(which(Triple_neg$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "3")))
percent_lateral <- percent(lateral)
## Doing the same assignments for anteroposterior locations
anterior <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "1")))
percent_anterior <- percent(anterior)
middle <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "2")))
percent_middle <- percent(middle)
posterior <- (length(which(Triple_neg$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "3")))
percent_posterior <- percent(posterior)
## Now I will filter the data for ER and conduct descriptive stats on it
ER_pos <- filter(d, Subtype == "1")
n_E <- nrow(ER_pos)
## Histological Grade filtering, and wrote a new function for percent to
## account for the change in n.
T_Hist_grade_1_E <- (length(which(ER_pos$Histologic_full == "1")))
T_hist_grade_2_E <- (length(which(ER_pos$Histologic_full == "2")))
T_hist_grade_3_E <- (length(which(ER_pos$Histologic_full == "3")))
percentE <- function(x) {
    perc <- x/846
    return(perc)
}
T_hist_grade1E_percent <- percentE(T_Hist_grade_1_E)
T_hist_grade2E_percent <- percentE(T_hist_grade_2_E)
T_hist_grade3E_percent <- percentE(T_hist_grade_3_E)
## In this, much like in the TN case, I replaced the NULL values with NA
## to remove them from the dataframe to make for easier wrangling.
ans <- ER_pos %>%
    replace(. == "#NULL!", NA)
Eg <- ans[!is.na(ans$Size_Pathol_cm_), ]
Mean_Size_E <- as.numeric(Eg$Size_Pathol_cm_)
Mean_size_ER <- mean(Mean_Size_E)
Mean_size_ER
## [1] 2.410102
Range_size_ER <- range(Eg$Size_Pathol_cm_)
Range_size_ER
## [1] "0.20" "9.80"
## Again, the values I got for both the range and mean of the subtype
## differ from what is reported. Could be attributed to replacement with
## zeros or averaging with filtered data. Unsure why the range is
## different than reported.... The filtering was done correctly.
## Now ran the statistics for the palpability of ER data.
No_ER <- (length(which(ER_pos$Palpability == "1")))
Yes_ER <- (length(which(ER_pos$Palpability == "0")))
Percent_palpE <- percentE(Yes_ER)
Percent_not_palpE <- percentE(No_ER)
## Now filter out the ER data based on each quadrant and assign variables
## to each to put into a table. Calculate the percentages as well based on
## the function that I wrote previously.
UOQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "1")))
percent_UOQ_E <- percentE(UOQ_E)
UIQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "2")))
percent_UIQ_E <- percent(UIQ_E)
LOQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "3")))
percent_LOQ_E <- percentE(LOQ_E)
LIQ_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "4")))
percent_LIQ_E <- percentE(LIQ_E)
periareolar_E <- (length(which(ER_pos$Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overlapping ==
    "5")))
percent_periareolar_E <- percentE(periareolar_E)
## Do the same for mediolateral locations. Assign valulations to variables
## to be put in a table.
medial_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "1")))
percent_medial_E <- percentE(medial_E)
central_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "2")))
percent_central_E <- percentE(central_E)
lateral_E <- (length(which(ER_pos$MR_Location_ML__1__medial_2__central_3__lateral_ ==
    "3")))
percent_lateral_E <- percent(lateral_E)
# ER now do the same assignments for anteroposterior locations
anterior_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "1")))
percent_anterior_E <- percentE(anterior_E)
middle_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "2")))
percent_middle_E <- percentE(middle_E)
posterior_E <- (length(which(ER_pos$MR_Location_AP__1__anterior_2__middle_3__posterior_ ==
    "3")))
percent_posterior_E <- percentE(posterior_E)
# Make variables into a table that matches the one in the paper.
a <- c("Palpability_no", "Palpability_yes", "Upper Outer Quadrant", "Upper Inner Quadrant",
    "Lower Outer Quadrant", "Lower Inner Quadrant", "Periareolar", "Mediolateral_Medial",
    "Mediolateral_Central", "Mediolateral_Lateral", "Anteroposterior_Anterior",
    "Anteroposterior_Middle", "Anteroposterior_Posterior", "Histological Grade 1",
    "Histological Grade 2", "Histological Grade 3")
b <- c(Yes, No, UOQ, UIQ, LOQ, LIQ, periareolar, medial, central, lateral, anterior,
    middle, posterior, T_Hist_grade_1, T_hist_grade_2, T_hist_grade_3)
c <- 100 * c(Percent_palp, Percent_not_palp, percent_UOQ, percent_UIQ, percent_LOQ,
    percent_LIQ, percent_periareolar, percent_medial, percent_central, percent_lateral,
    percent_anterior, percent_middle, percent_posterior, T_hist_grade1_percent,
    T_hist_grade2_percent, T_hist_grade3_percent)
d <- c(Yes_ER, No_ER, UOQ_E, UIQ_E, LOQ_E, LIQ_E, periareolar_E, medial_E, central_E,
    lateral_E, anterior_E, middle_E, posterior_E, T_Hist_grade_1_E, T_hist_grade_2_E,
    T_hist_grade_3_E)
e <- 100 * c(Percent_palpE, Percent_not_palpE, percent_UOQ_E, percent_UIQ_E,
    percent_LOQ_E, percent_LIQ_E, percent_periareolar_E, percent_medial_E, percent_central_E,
    percent_lateral_E, percent_anterior_E, percent_middle_E, percent_posterior_E,
    T_hist_grade1E_percent, T_hist_grade2E_percent, T_hist_grade3E_percent)
bind_col <- data.frame(Variables = a, Triple_Negative_Counts = b, Triple_Negative_Percentages = c,
    ER_Counts = d, ER_Percentages = e)
kable(bind_col, digits = 3) %>%
    kable_styling(font_size = 12, full_width = FALSE)
Variables Triple_Negative_Counts Triple_Negative_Percentages ER_Counts ER_Percentages
Palpability_no 60 23.438 379 44.799
Palpability_yes 196 76.562 467 55.201
Upper Outer Quadrant 126 49.219 375 44.326
Upper Inner Quadrant 73 28.516 225 87.891
Lower Outer Quadrant 26 10.156 112 13.239
Lower Inner Quadrant 18 7.031 61 7.210
Periareolar 13 5.078 73 8.629
Mediolateral_Medial 51 19.922 186 21.986
Mediolateral_Central 100 39.062 310 36.643
Mediolateral_Lateral 105 41.016 350 136.719
Anteroposterior_Anterior 17 6.641 80 9.456
Anteroposterior_Middle 99 38.672 396 46.809
Anteroposterior_Posterior 140 54.688 370 43.735
Histological Grade 1 2 0.781 125 14.775
Histological Grade 2 35 13.672 478 56.501
Histological Grade 3 219 85.547 243 28.723
bind_col
##                    Variables Triple_Negative_Counts Triple_Negative_Percentages
## 1             Palpability_no                     60                   23.437500
## 2            Palpability_yes                    196                   76.562500
## 3       Upper Outer Quadrant                    126                   49.218750
## 4       Upper Inner Quadrant                     73                   28.515625
## 5       Lower Outer Quadrant                     26                   10.156250
## 6       Lower Inner Quadrant                     18                    7.031250
## 7                Periareolar                     13                    5.078125
## 8        Mediolateral_Medial                     51                   19.921875
## 9       Mediolateral_Central                    100                   39.062500
## 10      Mediolateral_Lateral                    105                   41.015625
## 11  Anteroposterior_Anterior                     17                    6.640625
## 12    Anteroposterior_Middle                     99                   38.671875
## 13 Anteroposterior_Posterior                    140                   54.687500
## 14      Histological Grade 1                      2                    0.781250
## 15      Histological Grade 2                     35                   13.671875
## 16      Histological Grade 3                    219                   85.546875
##    ER_Counts ER_Percentages
## 1        379      44.799054
## 2        467      55.200946
## 3        375      44.326241
## 4        225      87.890625
## 5        112      13.238771
## 6         61       7.210402
## 7         73       8.628842
## 8        186      21.985816
## 9        310      36.643026
## 10       350     136.718750
## 11        80       9.456265
## 12       396      46.808511
## 13       370      43.735225
## 14       125      14.775414
## 15       478      56.501182
## 16       243      28.723404
knitr::include_graphics("Kim_et_al_photos/count_table.jpg")

knitr::include_graphics("Kim_et_al_photos/location_table.jpg")

# More Descriptive Stats
## Need to filter out the normalized distance from the chest wall and
## distance from the chest wall, need to relplace the NULL with NA in
## order to have a tidy dataset that I can use for future calculations. I
## decided to filter all of the positional data, so that I could use it
## for visualization in the next step.
Triple_neg_normY <- Triple_neg$Y_normalized %>%
    replace(. == "#NULL!", NA)
Triple_neg_normY <- na.omit(Triple_neg_normY)
m_Triple_neg_normY <- mean(Triple_neg_normY)
Triple_neg_normX <- Triple_neg$X_normalized %>%
    replace(. == "#NULL!", NA)
Triple_neg_normX <- na.omit(Triple_neg_normX)
m_Triple_neg_normX <- mean(Triple_neg_normX)
Triple_neg_normZ <- Triple_neg$Z_normalized %>%
    replace(. == "#NULL!", NA)
Triple_neg_normZ <- na.omit(Triple_neg_normZ)
m_Triple_neg_normZ <- mean(Triple_neg_normZ)
Triple_neg_Y <- Triple_neg$Y_distance %>%
    replace(. == "#NULL!", NA)
Triple_neg_Y <- na.omit(Triple_neg_Y)
m_Triple_neg_Y <- mean(Triple_neg_Y)
Triple_neg_X <- Triple_neg$X_distance %>%
    replace(. == "#NULL!", NA)
Triple_neg_X <- na.omit(Triple_neg_X)
m_Triple_neg_X <- mean(Triple_neg_X)
Triple_neg_Z <- Triple_neg$Z_distance %>%
    replace(. == "#NULL!", NA)
Triple_neg_Z <- na.omit(Triple_neg_Z)
m_Triple_neg_Z <- mean(Triple_neg_Z)
## Do the same positional filtering for ER data.
ER_normY <- ER_pos$Y_normalized %>%
    replace(. == "#NULL!", NA)
ER_normY <- na.omit(ER_normY)
m_ER_normY <- mean(ER_normY)
ER_normX <- ER_pos$X_normalized %>%
    replace(. == "#NULL!", NA)
ER_normX <- na.omit(ER_normX)
m_ER_normX <- mean(ER_normX)
ER_normZ <- ER_pos$Z_normalized %>%
    replace(. == "#NULL!", NA)
ER_normZ <- na.omit(ER_normZ)
m_ER_normZ <- mean(ER_normZ)
ER_Y <- ER_pos$Y_distance %>%
    replace(. == "#NULL!", NA)
ER_Y <- na.omit(ER_Y)
m_ER_Y <- mean(ER_Y)
ER_X <- ER_pos$X_distance %>%
    replace(. == "#NULL!", NA)
ER_X <- na.omit(ER_X)
m_ER_X <- mean(ER_X)
ER_Z <- ER_pos$Z_distance %>%
    replace(. == "#NULL!", NA)
ER_Z <- na.omit(ER_Z)
m_ER_Z <- mean(ER_Z)
## Begin doing inferential stats, assume normal distribution for standard
## error and confidence intervals. I created a function for both of them
## and will use it to define variables in the future.
SE <- function(x, type = "normal") {
    if (type == "normal") {
        SE <- sd(x)/sqrt(length(x))
    }
    if (type == "poisson") {
        SE <- sqrt(mean(x)/length(x))
        # mean(x) is estimate of lambda
    }
    return(SE)
}
CI_norm <- function(x, alpha = 0.05) {
    CI <- mean(x) + c(-1, 1) * qnorm(1 - alpha/2) * SE(x)
    # confidence interval based on normal distribution
    names(CI) <- c("CI norm L", "CI norm U")
    return(CI)
}
## Now take the data and filter it by positional filters and pass through
## the functions needed to get the variables of interest for the summary
## table that will later be created.
ER_means <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~mean(.)))
ER_SD <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~sd(.)))
ER_SEs <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~SE(., type = "normal")))
ER_CI_norm <- ER_pos %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~CI_norm(.)))
TN_means <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~mean(.)))
TN_SD <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~sd(.)))
TN_SEs <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~SE(., type = "normal")))
TN_CI_norm <- Triple_neg %>%
    dplyr::summarise(across(.cols = c(Y_distance, X_distance, Z_distance, Y_normalized,
        X_normalized, Z_normalized), .fns = ~CI_norm(.)))
## Not perform t-tests on on the variables/filtered data in order to
## extract the p-values needed for the tables.
X_norm <- t.test(ER_normX, Triple_neg_normX)
p_x_norm <- X_norm$p.value
Y_norm <- t.test(ER_normY, Triple_neg_normY)
p_y_norm <- Y_norm$p.value
Z_norm <- t.test(ER_normZ, Triple_neg_normZ)
p_Z_norm <- Z_norm$p.value
Z_dist <- t.test(ER_Z, Triple_neg_Z)
p_Z <- Z_dist$p.value
x_dist <- t.test(ER_X, Triple_neg_X)
p_x <- x_dist$p.value
y_dist <- t.test(ER_Y, Triple_neg_Y)
p_y <- y_dist$p.value
## Create the second descriptive table found in the paper
samp_1_summary <- as_tibble(t(bind_rows(ER_means, TN_means, ER_SD, TN_SD, ER_SEs,
    TN_SEs, ER_CI_norm, TN_CI_norm)), .name_repair = "minimal")
names(samp_1_summary) <- c(" ER Mean", "TN Mean", "ER SD", "TN SD", "ER Std Error",
    "TN Std Error", "ER Confidence Interval L", "ER Confidence Interval U",
    "TN Confidence Interval L", "TN Confidence Interval U")
variables <- tibble(Variable = c("Y_distance", "X_distance", "Z_distance", "Y_normalized",
    "X_normalized", "Z_normalized"))
samp_summary <- bind_cols(variables, samp_1_summary)
kable(samp_1_summary, digits = 3) %>%
    kable_styling(font_size = 12, full_width = FALSE)
ER Mean TN Mean ER SD TN SD ER Std Error TN Std Error ER Confidence Interval L ER Confidence Interval U TN Confidence Interval L TN Confidence Interval U
2.300 1.841 1.932 1.645 0.066 0.103 2.169 2.430 1.640 2.043
0.430 0.498 1.516 1.491 0.052 0.093 0.328 0.533 0.316 0.681
0.912 1.179 2.021 2.140 0.069 0.134 0.776 1.048 0.917 1.441
0.253 0.209 0.183 0.170 0.006 0.011 0.241 0.265 0.188 0.229
0.054 0.057 0.176 0.172 0.006 0.011 0.042 0.066 0.036 0.079
0.059 0.075 0.130 0.139 0.004 0.009 0.050 0.067 0.058 0.092
samp_summary
## # A tibble: 6 × 11
##   Variable    ` ER Mean` `TN Mean` `ER SD` `TN SD` `ER Std Error` `TN Std Error`
##   <chr>            <dbl>     <dbl>   <dbl>   <dbl>          <dbl>          <dbl>
## 1 Y_distance      2.30      1.84     1.93    1.65         0.0664         0.103  
## 2 X_distance      0.430     0.498    1.52    1.49         0.0521         0.0932 
## 3 Z_distance      0.912     1.18     2.02    2.14         0.0695         0.134  
## 4 Y_normaliz…     0.253     0.209    0.183   0.170        0.00630        0.0106 
## 5 X_normaliz…     0.0538    0.0574   0.176   0.172        0.00605        0.0108 
## 6 Z_normaliz…     0.0585    0.0746   0.130   0.139        0.00446        0.00867
## # … with 4 more variables: ER Confidence Interval L <dbl>,
## #   ER Confidence Interval U <dbl>, TN Confidence Interval L <dbl>,
## #   TN Confidence Interval U <dbl>
knitr::include_graphics("Kim_et_al_photos/distance_table.jpg")

# Inferential Stats Do inferential stats for the paper, no distinction
# between TN and ER for calculations for the beta coefficients Reading in
# the data again because so many variables were defined before that it is
# just simpler for my brain to refilter based off of what is needed from
# the total dataset
f <- "https://raw.githubusercontent.com/mrpickett26/Individual-Data-Analysis-Replication/main/tumor%20location%20raw%20data_final.csv"
d <- read_csv(f, col_names = TRUE)
## Rows: 1102 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Size_Pathol_cm_
## dbl (13): Subtype, Qudrants2_1_UOQ_2_UIQ_3_LOQ_4_LIQ_5_periareolar_6___overl...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ans2 <- d %>%
    replace(. == "#NULL!", NA)
ans2 <- na.omit(ans2)
## Redefine variables from the filtered dataset and convert them to
## numeric so that I can do further calculations on them more easily
Size_pathol <- as.numeric(ans2$Size_Pathol_cm_)
Hist_grade <- as.numeric(ans2$Histologic_full)
Subtype <- as.numeric(ans2$Subtype)
## Define the absolute and normalized y distances as numeric variable for
## use in later calculations
normY_d <- ans2$Y_normalized  ## normalized
Y_d <- ans2$Y_distance  ##Absolute
## Hand calculate the beta coefficients of normalized y-axis and tumor
## size before doing any of the built in calculations.
beta1_size <- cor(normY_d, Size_pathol) * (sd(normY_d)/sd(Size_pathol))
beta1_size
## [1] -0.008155774
(beta0_size <- mean(normY_d) - beta1_size * mean(Size_pathol))
## [1] 0.2633268
beta0_size  #do hand calculations of beta0 for normalized y distance am size of tumor
## [1] 0.2633268
## Now do standard error of b1 and b0 for normalized y distance am size of
## tumor by hand
n <- nrow(ans2)
df <- n - 2
mean_x <- mean(Size_pathol)
y_pred = beta0_size + beta1_size * normY_d
y_error = normY_d - y_pred
std_err_b1_dist <- sqrt((sum(y_error^2))/((n - 2) * sum((Size_pathol - mean_x)^2)))
std_err_b1_dist
## [1] 0.004522476
## I will use the lm function to evaluate the rest of the variables BUT if
## I was to do it by hand, it would follow the formats about for
## beta1_size, beta1_size_norm, beta0_size, and beta0_size_norm
## Now utilize the lm function to double check calculations and calculate
## the beta coefficients of absolute/normalized against selected variables
## (subtype, size, histological grade). I chose subtype because it is
## foundational to the paper, size becuase the data is continuous, and
## histological grade because it was good for me to practice categorical
## data analysis and it was in a relatively clean format.
m_size_path_norm <- lm(normY_d ~ Size_pathol, data = ans2)
summary(m_size_path_norm)
## 
## Call:
## lm(formula = normY_d ~ Size_pathol, data = ans2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26170 -0.14620 -0.04294  0.14483  0.64646 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.263327   0.012372   21.29   <2e-16 ***
## Size_pathol -0.008156   0.004457   -1.83   0.0675 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.181 on 1009 degrees of freedom
## Multiple R-squared:  0.003308,   Adjusted R-squared:  0.00232 
## F-statistic: 3.349 on 1 and 1009 DF,  p-value: 0.06754
std_err_pn <- coef(summary(m_size_path_norm))[, "Std. Error"]
m_Hist_grade_norm <- lm(normY_d ~ Hist_grade, data = ans2)
summary(m_Hist_grade_norm)  #This one seems to differ 
## 
## Call:
## lm(formula = normY_d ~ Hist_grade, data = ans2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27989 -0.15185 -0.02381  0.12011  0.67619 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.307928   0.020476  15.039  < 2e-16 ***
## Hist_grade  -0.028039   0.008526  -3.289  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1803 on 1009 degrees of freedom
## Multiple R-squared:  0.01061,    Adjusted R-squared:  0.009625 
## F-statistic: 10.82 on 1 and 1009 DF,  p-value: 0.001041
std_err_hgn <- coef(summary(m_Hist_grade_norm))[, "Std. Error"]
m_Subtype_norm <- lm(normY_d ~ Subtype, data = ans2)
summary(m_Subtype_norm)
## 
## Call:
## lm(formula = normY_d ~ Subtype, data = ans2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25448 -0.15448 -0.05448  0.14552  0.69520 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.279312   0.011346  24.619  < 2e-16 ***
## Subtype     -0.024836   0.006766  -3.671 0.000254 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1801 on 1009 degrees of freedom
## Multiple R-squared:  0.01318,    Adjusted R-squared:  0.0122 
## F-statistic: 13.48 on 1 and 1009 DF,  p-value: 0.0002544
std_err_sn <- coef(summary(m_Subtype_norm))[, "Std. Error"]
m_size_path <- lm(Y_d ~ Size_pathol, data = ans2)
summary(m_size_path)
## 
## Call:
## lm(formula = Y_d ~ Size_pathol, data = ans2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3013 -1.4678 -0.3680  0.9822  7.7988 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.36809    0.12893  18.368   <2e-16 ***
## Size_pathol -0.06675    0.04644  -1.437    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.886 on 1009 degrees of freedom
## Multiple R-squared:  0.002043,   Adjusted R-squared:  0.001054 
## F-statistic: 2.066 on 1 and 1009 DF,  p-value: 0.1509
std_err_sp <- coef(summary(m_size_path))[, "Std. Error"]
m_Hist_grade <- lm(Y_d ~ Hist_grade, data = ans2)
summary(m_Hist_grade)  #This one seems to differ 
## 
## Call:
## lm(formula = Y_d ~ Hist_grade, data = ans2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4707 -1.4899 -0.4091  0.9909  7.9909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.85152    0.21333  13.367  < 2e-16 ***
## Hist_grade  -0.28079    0.08883  -3.161  0.00162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.879 on 1009 degrees of freedom
## Multiple R-squared:  0.009806,   Adjusted R-squared:  0.008825 
## F-statistic: 9.993 on 1 and 1009 DF,  p-value: 0.001619
std_err_hg <- coef(summary(m_Hist_grade))[, "Std. Error"]
m_Subtype <- lm(Y_d ~ Subtype, data = ans2)
summary(m_Subtype)
## 
## Call:
## lm(formula = Y_d ~ Subtype, data = ans2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3162 -1.5162 -0.4162  0.9838  7.7838 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.5650     0.1182  21.697  < 2e-16 ***
## Subtype      -0.2487     0.0705  -3.528 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.876 on 1009 degrees of freedom
## Multiple R-squared:  0.01219,    Adjusted R-squared:  0.01121 
## F-statistic: 12.45 on 1 and 1009 DF,  p-value: 0.000437
std_err_s <- coef(summary(m_Subtype))[, "Std. Error"]
#'In statistics, standardized (regression) coefficients, also called beta coefficients or beta weights, are the estimates resulting from a regression analysis where the underlying data have been standardized so that the variances of dependent and independent variables are equal to 1.' source: https://en.wikipedia.org/wiki/Standardized_coefficient, the lm.beta package allows us to calculate this easily, and will be done for the previously used variables. 
Std_beta_coeff_Y_norm_size <- lm.beta(m_size_path_norm)
summary(Std_beta_coeff_Y_norm_size)
## 
## Call:
## lm(formula = normY_d ~ Size_pathol, data = ans2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.26170 -0.14620 -0.04294  0.14483  0.64646 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.263327     0.000000   0.012372   21.29   <2e-16 ***
## Size_pathol -0.008156    -0.057517   0.004457   -1.83   0.0675 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.181 on 1009 degrees of freedom
## Multiple R-squared:  0.003308,   Adjusted R-squared:  0.00232 
## F-statistic: 3.349 on 1 and 1009 DF,  p-value: 0.06754
coeff_size_norm <- Std_beta_coeff_Y_norm_size$coefficients
std_coeff_size_norm <- Std_beta_coeff_Y_norm_size$standardized.coefficients
Std_beta_coeff_Y_size <- lm.beta(m_size_path)
summary(Std_beta_coeff_Y_size)
## 
## Call:
## lm(formula = Y_d ~ Size_pathol, data = ans2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3013 -1.4678 -0.3680  0.9822  7.7988 
## 
## Coefficients:
##             Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  2.36809      0.00000    0.12893  18.368   <2e-16 ***
## Size_pathol -0.06675     -0.04520    0.04644  -1.437    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.886 on 1009 degrees of freedom
## Multiple R-squared:  0.002043,   Adjusted R-squared:  0.001054 
## F-statistic: 2.066 on 1 and 1009 DF,  p-value: 0.1509
coeff_size <- Std_beta_coeff_Y_size$coefficients
std_coeff_size <- Std_beta_coeff_Y_size$standardized.coefficients
std_beta_hist_norm <- lm.beta(m_Hist_grade_norm)
summary(std_beta_hist_norm)
## 
## Call:
## lm(formula = normY_d ~ Hist_grade, data = ans2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27989 -0.15185 -0.02381  0.12011  0.67619 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.307928     0.000000   0.020476  15.039  < 2e-16 ***
## Hist_grade  -0.028039    -0.102983   0.008526  -3.289  0.00104 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1803 on 1009 degrees of freedom
## Multiple R-squared:  0.01061,    Adjusted R-squared:  0.009625 
## F-statistic: 10.82 on 1 and 1009 DF,  p-value: 0.001041
coeff_hist_norm <- std_beta_hist_norm$coefficients
std_coeff_hist_norm <- std_beta_hist_norm$standardized.coefficients
std_beta_hist <- lm.beta(m_Hist_grade)
summary(m_Hist_grade)
## 
## Call:
## lm(formula = Y_d ~ Hist_grade, data = ans2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4707 -1.4899 -0.4091  0.9909  7.9909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.85152    0.21333  13.367  < 2e-16 ***
## Hist_grade  -0.28079    0.08883  -3.161  0.00162 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.879 on 1009 degrees of freedom
## Multiple R-squared:  0.009806,   Adjusted R-squared:  0.008825 
## F-statistic: 9.993 on 1 and 1009 DF,  p-value: 0.001619
coeff_hist <- std_beta_hist$coefficients
std_coeff_hist <- std_beta_hist$standardized.coefficients
std_beta_sub_norm <- lm.beta(m_Subtype_norm)
summary(std_beta_sub_norm)
## 
## Call:
## lm(formula = normY_d ~ Subtype, data = ans2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25448 -0.15448 -0.05448  0.14552  0.69520 
## 
## Coefficients:
##              Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)  0.279312     0.000000   0.011346  24.619  < 2e-16 ***
## Subtype     -0.024836    -0.114801   0.006766  -3.671 0.000254 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1801 on 1009 degrees of freedom
## Multiple R-squared:  0.01318,    Adjusted R-squared:  0.0122 
## F-statistic: 13.48 on 1 and 1009 DF,  p-value: 0.0002544
coeff_norm_sub <- std_beta_sub_norm$coefficients
std_coeff_norm_sub <- std_beta_sub_norm$standardized.coefficients
std_beta_sub <- lm.beta(m_Subtype)
summary(std_beta_sub)
## 
## Call:
## lm(formula = Y_d ~ Subtype, data = ans2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3162 -1.5162 -0.4162  0.9838  7.7838 
## 
## Coefficients:
##             Estimate Standardized Std. Error t value Pr(>|t|)    
## (Intercept)   2.5650       0.0000     0.1182  21.697  < 2e-16 ***
## Subtype      -0.2487      -0.1104     0.0705  -3.528 0.000437 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.876 on 1009 degrees of freedom
## Multiple R-squared:  0.01219,    Adjusted R-squared:  0.01121 
## F-statistic: 12.45 on 1 and 1009 DF,  p-value: 0.000437
coeff_sub <- std_beta_sub$coefficients
std_coeff_sub <- std_beta_sub$standardized.coefficients
## Now creating the table that is seen in the paper for inferential stats
## (beta, standard err, standardized beta). Pulling all of the 'outputs'
## out of the summary table and creating a tibble to bind together into
## the larger table.
samp_2_summary <- as_tibble(t(bind_rows(coeff_hist, std_err_hg, std_coeff_hist)),
    .name_repair = "minimal")
samp_2_summary1 <- as_tibble(t(bind_rows(coeff_hist_norm, std_err_hgn, std_coeff_hist_norm)),
    .name_repair = "minimal")
samp_3_summary <- as_tibble(t(bind_rows(coeff_size, std_err_sp, std_coeff_size)),
    .name_repair = "minimal")
samp_3_summary1 <- as_tibble(t(bind_rows(coeff_size_norm, std_err_pn, std_coeff_size_norm)),
    .name_repair = "minimal")
samp_4_summary <- as_tibble(t(bind_rows(coeff_sub, std_err_s, std_coeff_sub)),
    .name_repair = "minimal")
samp_4_summary1 <- as_tibble(t(bind_rows(coeff_norm_sub, std_err_sn, std_coeff_norm_sub)),
    .name_repair = "minimal")
samp_2_summary <- samp_2_summary[-c(1), ]
samp_3_summary <- samp_3_summary[-c(1), ]
samp_4_summary <- samp_4_summary[-c(1), ]
samp_2_summary1 <- samp_2_summary1[-c(1), ]
samp_3_summary1 <- samp_3_summary1[-c(1), ]
samp_4_summary1 <- samp_4_summary1[-c(1), ]
Y_dist_data <- bind_rows(samp_2_summary, samp_3_summary, samp_4_summary, samp_2_summary1,
    samp_3_summary1, samp_4_summary1)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
names(Y_dist_data) <- c("Beta Coefficient", "Standard Error", "Standardized Beta Coefficient")
variables <- tibble(Variable = c("Absolute y-axis Histological Grade", "Normalized y-axis Histological Grade",
    "Absolute y-axis Tumor Size", "Normalized y-axis Tumor Size", "Absolute y-axis Triple Negative",
    " Normalized y-axis Triple Negative"))
Y_dist_data <- bind_cols(variables, Y_dist_data)
kable(Y_dist_data, digits = 3) %>%
    kable_styling(font_size = 12, full_width = FALSE)
Variable Beta Coefficient Standard Error Standardized Beta Coefficient
Absolute y-axis Histological Grade -0.281 0.089 -0.099
Normalized y-axis Histological Grade -0.067 0.046 -0.045
Absolute y-axis Tumor Size -0.249 0.070 -0.110
Normalized y-axis Tumor Size -0.028 0.009 -0.103
Absolute y-axis Triple Negative -0.008 0.004 -0.058
Normalized y-axis Triple Negative -0.025 0.007 -0.115
Y_dist_data
## # A tibble: 6 × 4
##   Variable              `Beta Coefficie… `Standard Error` `Standardized Beta Co…
##   <chr>                            <dbl>            <dbl>                  <dbl>
## 1 "Absolute y-axis His…         -0.281            0.0888                 -0.0990
## 2 "Normalized y-axis H…         -0.0668           0.0464                 -0.0452
## 3 "Absolute y-axis Tum…         -0.249            0.0705                 -0.110 
## 4 "Normalized y-axis T…         -0.0280           0.00853                -0.103 
## 5 "Absolute y-axis Tri…         -0.00816          0.00446                -0.0575
## 6 " Normalized y-axis …         -0.0248           0.00677                -0.115
knitr::include_graphics("Kim_et_al_photos/inferential_table.jpg")

# DATA VISUALIZATION Do data visualization now. No real visualization in
# the paper, so I chose a 3D plot of the positional data of the different
# cancer subtypes for both normalized and absolute. This allows me to
# visualize the distribution of where each tumor is located and how the
# normalization shifts the positions of the data.
Norm_ER__plot <- scatter3D(ER_normX, ER_normY, ER_normZ, clab = c("ER Normalized",
    "Position (cm)"))

Abs_ER__plot <- scatter3D(ER_X, ER_Y, ER_Z, clab = c(" ER Absolute", "Position (cm)"))

Norm_TN__plot <- scatter3D(Triple_neg_normX, Triple_neg_normY, Triple_neg_normZ,
    clab = c("TN Normalized", "Position (cm)"))

Abs_TN__plot <- scatter3D(Triple_neg_X, Triple_neg_Y, Triple_neg_Z, clab = c("TN Absolute",
    "Position (cm)"))

## When the positional data is normalized, it looks very 'normal' (go
## figure!), there is no skewness to it, which makes sense since it all is
## normalized within the spread of the data, so the center should be at
## point zero.
## Because the data is categorical, I decided that it didn't make much
## sense to plot all of them as scatter plots, but that a dot and whisker
## plot might be more intuitive. The dots represent the regression
## coefficients of the two separate models and the whiskers represent the
## 99% confidence intervals (as specified in the code)
dwplot(list(m_Hist_grade, m_Hist_grade_norm, m_Subtype, m_Subtype_norm, m_size_path,
    m_size_path_norm), ci = 0.99, show_intercept = FALSE) + scale_y_discrete(labels = c(Subtype = "",
    Hist_grade = "Histological Grade", Size_pathol = "Subtype")) + scale_color_hue(labels = c("Normalized y-axis position",
    "Absolute y-axis position", "Normalized y-axis position", "Absolute y-axis position",
    "Normalized y-axis position", "Absolute y-axis position")) + theme(plot.title = element_text(face = "bold")) +
    ggtitle("Regression Coefficients for Breast Tumor Y-axis positions") + xlab("Standardized Beta Coefficient")

## I don't know why the labels on the y axis have to be switched to be
## correct... I went back and checked and the models earlier and they are
## labelled correctly and everything. Not super familiar with dotwhisker.
# Normalized y-axis and Tumor Size
plot(m_size_path_norm)

# Absolute y-axis and Tumor Size
plot(m_size_path)